Introduction

Code and results. Full report at:

NB This is a bare bones .Rmd file. Please see the report for full details.

This run assumes:

  • HEEP2full = 280
  • HEEP2fullAvail = 700
  • HEEP2poolOb = 2800

Data

Load the data - sourced from https://reshare.ukdataservice.ac.uk/853334/

dataPath <- "~/temp/"
# half-hourly total household consumption (pre-prepared)
totalF <- paste0(dataPath, "halfHourImputedTotalDemand_Fixed.csv.gz")

# half-hourly lighting consumption
lightingF <- paste0(dataPath, "halfHourLighting.csv.gz")

# half-hourly heat pump consumption
heatPumpF <- paste0(dataPath, "halfHourHeatPump.csv.gz")

# household attribute data
hhF <- paste0(dataPath, "ggHouseholdAttributesSafe_2019-04-09.csv.gz")

Now prep and check the data

# > power ----
totalDT <- data.table::fread(totalF)
setkey(totalDT, linkID)
uniqueN(totalDT$linkID)
## [1] 40
heatPumpDT <- data.table::fread(heatPumpF)
uniqueN(heatPumpDT$linkID)
## [1] 29
setkey(heatPumpDT, linkID)

lightingDT <- data.table::fread(lightingF)
uniqueN(lightingDT$linkID)
## [1] 23
setkey(lightingDT, linkID)

powerDataPrep <- function(dt){
  dt[, meanConsumptionkWh := (meanPowerW/2)/1000]
  dt[, r_as_dateTime := lubridate::as_datetime(r_dateTimeHalfHour)]
  dt[, r_dateTimeNZ := lubridate::with_tz(r_as_dateTime, "Pacific/Auckland")]
  dt[, r_date := lubridate::as_date(r_dateTimeNZ)]
  dt[, r_halfHour := hms::as_hms(r_dateTimeNZ)]
  dt <- addNZSeason(dt, dateTime = "r_dateTimeNZ")
  return(dt)
}

totalDT <- powerDataPrep(totalDT)
heatPumpDT <- powerDataPrep(heatPumpDT)
lightingDT <- powerDataPrep(lightingDT)

Check that the time of day demand profiles look OK.

# check - beware which hemisphere we are in?
table(totalDT$month, totalDT$season)
##      
##       Spring Summer Autumn Winter
##   Jan      0 121570      0      0
##   Feb      0 113668      0      0
##   Mar      0      0 128231      0
##   Apr      0      0 138259      0
##   May      0      0 138250      0
##   Jun      0      0      0 141182
##   Jul      0      0      0 151549
##   Aug      0      0      0 138193
##   Sep 127962      0      0      0
##   Oct 130546      0      0      0
##   Nov 120549      0      0      0
##   Dec      0 119124      0      0
testDT <- totalDT[, .(meankWh = mean(meanConsumptionkWh)), 
                  keyby = .(r_halfHour, season)]

ggplot2::ggplot(testDT, aes(x = r_halfHour, y = meankWh, colour = season)) + 
  geom_line() +
  labs(caption = "Whole household kWh")

testDT <- lightingDT[, .(meankWh = mean(meanConsumptionkWh)), 
                  keyby = .(r_halfHour, season)]
ggplot2::ggplot(testDT, aes(x = r_halfHour, y = meankWh, colour = season)) + 
  geom_line() +
  labs(caption = "Lighting kWh where known")

# looks OK
message("# half-hour: whole household kWh")
## # half-hour: whole household kWh
summary(totalDT)
##     linkID            circuit          r_dateTimeHalfHour           
##  Length:1569083     Length:1569083     Min.   :2014-01-06 03:00:00  
##  Class :character   Class :character   1st Qu.:2015-06-07 10:00:00  
##  Mode  :character   Mode  :character   Median :2016-03-10 10:00:00  
##                                        Mean   :2016-04-27 11:10:59  
##                                        3rd Qu.:2017-03-09 23:30:00  
##                                        Max.   :2018-08-01 11:30:00  
##                                                                     
##       nObs         meanPowerW         sdPowerW         minPowerW       
##  Min.   : 1.00   Min.   :-7627.7   Min.   :   0.00   Min.   :-12407.5  
##  1st Qu.:30.00   1st Qu.:  260.5   1st Qu.:  62.35   1st Qu.:   127.6  
##  Median :30.00   Median :  527.7   Median : 147.06   Median :   252.4  
##  Mean   :30.26   Mean   :  894.8   Mean   : 410.87   Mean   :   427.9  
##  3rd Qu.:30.00   3rd Qu.: 1259.9   3rd Qu.: 706.25   3rd Qu.:   529.9  
##  Max.   :60.00   Max.   :10981.6   Max.   :4954.00   Max.   :  9600.6  
##                                    NA's   :92                          
##    maxPowerW       meanConsumptionkWh r_as_dateTime                
##  Min.   :-6970.9   Min.   :-3.8138    Min.   :2014-01-06 03:00:00  
##  1st Qu.:  395.5   1st Qu.: 0.1302    1st Qu.:2015-06-07 10:00:00  
##  Median : 1003.4   Median : 0.2639    Median :2016-03-10 10:00:00  
##  Mean   : 1702.7   Mean   : 0.4474    Mean   :2016-04-27 11:10:59  
##  3rd Qu.: 2746.7   3rd Qu.: 0.6299    3rd Qu.:2017-03-09 23:30:00  
##  Max.   :15593.2   Max.   : 5.4908    Max.   :2018-08-01 11:30:00  
##                                                                    
##   r_dateTimeNZ                     r_date            r_halfHour      
##  Min.   :2014-01-06 16:00:00   Min.   :2014-01-06   Length:1569083   
##  1st Qu.:2015-06-07 22:00:00   1st Qu.:2015-06-07   Class1:hms       
##  Median :2016-03-10 23:00:00   Median :2016-03-10   Class2:difftime  
##  Mean   :2016-04-27 23:10:59   Mean   :2016-04-27   Mode  :numeric   
##  3rd Qu.:2017-03-10 12:30:00   3rd Qu.:2017-03-10                    
##  Max.   :2018-08-01 23:30:00   Max.   :2018-08-01                    
##                                                                      
##      month           season      
##  Jul    :151549   Spring:379057  
##  Jun    :141182   Summer:354362  
##  Apr    :138259   Autumn:404740  
##  May    :138250   Winter:430924  
##  Aug    :138193                  
##  Oct    :130546                  
##  (Other):731104

Process and check household data

# > household ----
hhDT <- data.table::fread(hhF)
hhDT <- code_Q7(hhDT)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
hhDT[, hasPV := `PV Inverter`]
hhDT[, hasPV := ifelse(hasPV == "", "No", hasPV)]

# check
uniqueN(hhDT$linkID)
## [1] 44
uniqueN(totalDT$linkID)
## [1] 40
setkey(hhDT, linkID)
setkey(totalDT, linkID)

#> data checks ----

# any zeros & negative numbers?
hist(totalDT$meanConsumptionkWh)

# some

# check if aggregated to daily sum
dailyAllDT <- totalDT[, .(sumkWh = sum(meanConsumptionkWh)), keyby = .(r_date, linkID)]
hist(dailyAllDT$sumkWh)

# still some neg - probably due to PV?

# check if aggregated to daily mean
dt <- dailyAllDT[, .(meankWh = mean(sumkWh),
                     sdkWh = sd(sumkWh)), keyby = .(linkID)]
hist(dt$meankWh)

# still some

Check if the negative values are to do with PV

# need to check -ve = mid-day, if not is not PV must just be errors?
totalDT[, testValues := "> 0"]
totalDT[, testValues := ifelse(meanConsumptionkWh == 0, "0", testValues)]
totalDT[, testValues := ifelse(meanConsumptionkWh < 0, "< 0", testValues)]
testDT <- totalDT[hhDT[, .(linkID, hasPV)]]
dt <- testDT[testValues == "< 0", .(nValues = .N),
                  keyby = .(r_halfHour, season, linkID, testValues,hasPV)]

p <- ggplot2::ggplot(dt[!is.na(testValues) & !is.na(season)], aes(x = r_halfHour, y = nValues, colour = linkID)) +
  geom_line() +
  facet_grid(season ~ testValues + hasPV) +
  labs(x = "Half hour",
       y = "N",
       caption = "Colours = individual dwellings, legend omitted for clarity"
  ) +
  theme(legend.position="none")
p

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plotly::ggplotly(p)
# so:
# a) neg values indicate PV
# b) at least 1 dwelling had PV but didn't say so in survey - rf_06
hhDT[, hasPVfixed := ifelse(linkID == "rf_06", "yes", hasPV)]
hhDT[, .(n = .N), keyby = .(hasPV, hasPVfixed)]
##    hasPV hasPVfixed  n
## 1:    No         No 39
## 2:    No        yes  1
## 3:   yes        yes  4
# If we only care about network load we could keep all values > 0 only
# If we want total energy input then we need grid draw + PV input which we might be able
# to calculate from the PV circuit W - grid export
# but it gets complicated.
# So for now we'll leave out the dwellinmgs which seem to have PV
setkey(dailyAllDT, linkID)
dailyDT <- dailyAllDT[hhDT[hasPVfixed == "No"]]

dailyMeanDT <- dailyDT[!is.na(sumkWh), .(meankWh = mean(sumkWh, na.rm = TRUE),
                           sdkWh = sd(sumkWh, na.rm = TRUE)), keyby = .(linkID)]
setkey(dailyMeanDT, linkID)
#> final data ----
dailyMeanLinkedDT <- dailyMeanDT[hhDT]
dailyMeanLinkedDT <- dailyMeanLinkedDT[!is.na(meankWh)]

Statistical power calculations

make_p95Table <- function(dt,groupVar, kWh = "meankWh"){
  # aggregates kWh
  # remember that kWh could be defined in a range of ways (daily sum, mean etc)
  t95 <- dt[!is.na(Q7labAgg), .(meankWh = mean(get(kWh)),
                                           minkWh = min(get(kWh)),
                                           maxkWh = max(get(kWh)),
                                           sdkWh = sd(get(kWh)),
                                           nHouseholds = uniqueN(linkID)),
                       keyby = .(category = get(groupVar))]
  setnames(t95, c("category"), groupVar)
  t90 <- copy(t95) # has to be a copy
  
  t95[, Threshold := "p < 0.05 (observed n)"]
  t95[, se := sdkWh/sqrt(nHouseholds)]
  t95[, error := qnorm(0.975)*se]
  t95[, CI_lower := meankWh - error]
  t95[, CI_upper := meankWh + error]
  
  t90[, Threshold := "p < 0.1 (observed n)"]
  t90[, se := sdkWh/sqrt(nHouseholds)]
  t90[, error := qnorm(0.95)*se]
  t90[, CI_lower := meankWh - error]
  t90[, CI_upper := meankWh + error]
  
  return(rbind(t90, t95))
}

make_CIplot <- function(t, kwh = "meankWh", xVar, xLab){
  # plots whatever kWh is by xVar
  p <- ggplot2::ggplot(obsT, aes(x = get(xVar), y = kWh, fill = Threshold)) +
    geom_col() + 
    geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
    facet_wrap(Threshold ~  .) +
    labs(x = xLab,
         y = "Mean daily kWh"
    ) +
    theme(legend.position="bottom")
  ggplot2::ggsave(paste0(xVar,"_CI.png"), p, 
                  width = 6, path = here::here("plots"))
  return(p)
}

Dwelling age

Compare pre 2000 with after

obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(Q7labAgg) & !is.na(meankWh)], 
                      groupVar = "Q7labAgg")

obsT[, kWh := meankWh]
make_CIplot(obsT, xVar = "Q7labAgg", xLab = "Dwelling age")
## Saving 6 x 5 in image

kableExtra::kable(obsT, digits = 3) %>%
  kable_styling()
Q7labAgg meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper kWh
< 1999 23.253 9.921 45.924 10.094 22 p < 0.1 (observed n) 2.152 3.540 19.713 26.793 23.253
>= 2000 18.721 6.298 27.650 7.151 10 p < 0.1 (observed n) 2.261 3.720 15.001 22.441 18.721
< 1999 23.253 9.921 45.924 10.094 22 p < 0.05 (observed n) 2.152 4.218 19.035 27.471 23.253
>= 2000 18.721 6.298 27.650 7.151 10 p < 0.05 (observed n) 2.261 4.432 14.288 23.153 18.721
# What 'effect size' do we observe?
m1 <- obsT[Q7labAgg == "< 1999" & Threshold %like% "0.05", meankWh]
m2 <- obsT[Q7labAgg == ">= 2000" & Threshold %like% "0.05", meankWh]

# common sd??
r <- lm(meankWh ~ Q7labAgg, data = dailyMeanLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse

diff <- abs(m1-m2)
# Difference
diff
## [1] 4.532435
d <- diff/rse

n1 <- obsT[Q7labAgg == "< 1999" & Threshold %like% "0.05", nHouseholds]
n2 <- obsT[Q7labAgg == ">= 2000" & Threshold %like% "0.05", nHouseholds]

p1 <- n1/(n1 + n2) # with

# what effect size would we need for the GG n? p = 0.05
pwr95 <- pwr::pwr.t2n.test(n1 = n1,
                  n2 = n2,
                  d = , sig.level = 0.05, power = 0.8)
pwr95$d
## [1] 1.104225
# which means a kWh difference of
pwr95$d * rse
## [1] 10.27974
# what effect size would we need for the GG n? p = 0.1
pwr90 <- pwr::pwr.t2n.test(n1 = n1,
                  n2 = n2,
                  d = , sig.level = 0.1, power = 0.8)
pwr90
## 
##      t test power calculation 
## 
##              n1 = 22
##              n2 = 10
##               d = 0.9704298
##       sig.level = 0.1
##           power = 0.8
##     alternative = two.sided
# which means a kWh difference of
pwr90$d * rse
## [1] 9.034182
# what effect size could we get with HEEPfull? p = 0.05
n1 <- p1 * HEEP2full
n2 <- HEEP2full - n1
pwr95_HEEP2full <- pwr::pwr.t2n.test(n1 = n1,
                  n2 = n2,
                  d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2full
## 
##      t test power calculation 
## 
##              n1 = 192.5
##              n2 = 87.5
##               d = 0.3624643
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2full$d * rse
## [1] 3.374348
# what effect size could we get with HEEPfullAvail? p = 0.05
n1 <- p1 * HEEP2fullAvail
n2 <- HEEP2fullAvail - n1
pwr95_HEEP2fullAvail <- pwr::pwr.t2n.test(n1 = n1,
                                     n2 = n2,
                                     d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2fullAvail
## 
##      t test power calculation 
## 
##              n1 = 481.25
##              n2 = 218.75
##               d = 0.2287704
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2fullAvail$d * rse
## [1] 2.12973
# what effect size could we get with HEEP2pool? p = 0.05
# HEEP2pool obtained - see table
n1 <- p1 * HEEP2poolOb
n2 <- HEEP2poolOb - n1
pwr95_HEEP2poolOb <- pwr::pwr.t2n.test(n1 = n1,
                  n2 = n2,
                  d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolOb
## 
##      t test power calculation 
## 
##              n1 = 1925
##              n2 = 875
##               d = 0.1142672
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolOb$d * rse
## [1] 1.063766
# report table
kableExtra::kable(obsT, digits = 2) %>%
  kable_styling()
Q7labAgg meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper kWh
< 1999 23.25 9.92 45.92 10.09 22 p < 0.1 (observed n) 2.15 3.54 19.71 26.79 23.25
>= 2000 18.72 6.30 27.65 7.15 10 p < 0.1 (observed n) 2.26 3.72 15.00 22.44 18.72
< 1999 23.25 9.92 45.92 10.09 22 p < 0.05 (observed n) 2.15 4.22 19.04 27.47 23.25
>= 2000 18.72 6.30 27.65 7.15 10 p < 0.05 (observed n) 2.26 4.43 14.29 23.15 18.72

Heat pumps

Compare with/without

# GG
t <- table(hhDT$`Heat pump number`, useNA = "always")
t
## 
##    1    3 <NA> 
##   23    2   19
prop.table(t)
## 
##          1          3       <NA> 
## 0.52272727 0.04545455 0.43181818
# with elec data
dailyMeanLinkedDT[, heatPumps := `Heat pump number`]
dailyMeanLinkedDT[, heatPumps := ifelse(is.na(`Heat pump number`), 0, heatPumps)]
dailyMeanLinkedDT[, .(nHHs = uniqueN(linkID)), keyby = .(heatPumps)]
##    heatPumps nHHs
## 1:         0   14
## 2:         1   19
## 3:         3    1
obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(heatPumps)], "heatPumps")
obsT[, kWh := meankWh]
make_CIplot(obsT, kwh = "meankWh", xVar = "heatPumps", xLab = "N heat pumps")
## Saving 6 x 5 in image

kableExtra::kable(obsT, digits = 3) %>%
  kable_styling()
heatPumps meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper kWh
0 19.646 6.298 34.445 8.083 13 p < 0.1 (observed n) 2.242 3.687 15.959 23.333 19.646
1 22.693 9.921 45.924 10.032 18 p < 0.1 (observed n) 2.365 3.890 18.804 26.583 22.693
3 34.898 34.898 34.898 NA 1 p < 0.1 (observed n) NA NA NA NA 34.898
0 19.646 6.298 34.445 8.083 13 p < 0.05 (observed n) 2.242 4.394 15.252 24.040 19.646
1 22.693 9.921 45.924 10.032 18 p < 0.05 (observed n) 2.365 4.635 18.059 27.328 22.693
3 34.898 34.898 34.898 NA 1 p < 0.05 (observed n) NA NA NA NA 34.898
# switch to binary for HPs - yes or no
dailyMeanLinkedDT[, hasHeatPump := "No"]
dailyMeanLinkedDT[, hasHeatPump := ifelse(heatPumps > 0, "Yes", hasHeatPump)]
obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(hasHeatPump)], "hasHeatPump")

# report table
kableExtra::kable(obsT, digits = 2) %>%
  kable_styling()
hasHeatPump meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper
No 19.65 6.30 34.44 8.08 13 p < 0.1 (observed n) 2.24 3.69 15.96 23.33
Yes 23.34 9.92 45.92 10.14 19 p < 0.1 (observed n) 2.33 3.83 19.51 27.16
No 19.65 6.30 34.44 8.08 13 p < 0.05 (observed n) 2.24 4.39 15.25 24.04
Yes 23.34 9.92 45.92 10.14 19 p < 0.05 (observed n) 2.33 4.56 18.77 27.90
# What 'effect size' do we observe?
m1 <- obsT[hasHeatPump == "No" & Threshold %like% "0.05", meankWh]
m2 <- obsT[hasHeatPump == "Yes" & Threshold %like% "0.05", meankWh]

# common sd??
r <- lm(meankWh ~ hasHeatPump, data = dailyMeanLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse

diff <- abs(m1-m2)
# Difference
diff
## [1] 3.689753
d <- diff/rse

n1 <- obsT[hasHeatPump == "No" & Threshold %like% "0.05", nHouseholds]
n2 <- obsT[hasHeatPump == "Yes" & Threshold %like% "0.05", nHouseholds]

# what effect size would we need for the GG n? p = 0.05
pwr95 <- pwr::pwr.t2n.test(n1 = n1,
                           n2 = n2,
                           d = , sig.level = 0.05, power = 0.8)
pwr95
## 
##      t test power calculation 
## 
##              n1 = 13
##              n2 = 19
##               d = 1.042146
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
pwr95$d
## [1] 1.042146
# which means a kWh difference of
pwr95$d * rse
## [1] 9.540305
# what effect size would we need for the GG n? p = 0.1
pwr90 <- pwr::pwr.t2n.test(n1 = n1,
                           n2 = n2,
                           d = , sig.level = 0.1, power = 0.8)
pwr90
## 
##      t test power calculation 
## 
##              n1 = 13
##              n2 = 19
##               d = 0.9158508
##       sig.level = 0.1
##           power = 0.8
##     alternative = two.sided
# which means a kWh difference of
pwr90$d * rse
## [1] 8.384138
# what effect size could we get with HEEP2full? p = 0.05
# Use HCS proportions
# HCS: 45% owned, 27% renters, overall = 38% (Vicki White and Mark Jones 2017)
n1 <- HEEP2full - (HEEP2full*0.38)
n2 <- HEEP2full*0.38
pwr95_HEEP2full <- pwr::pwr.t2n.test(n1 = n1,
                                     n2 = n2,
                                     d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2full
## 
##      t test power calculation 
## 
##              n1 = 173.6
##              n2 = 106.4
##               d = 0.3461309
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2full$d * rse
## [1] 3.168648
# for drawing plot
getDrange <- function(nList,p){
  dt <- data.table::data.table()
  for(n in nList){
    n2 <- n * p # p = the proportion that have X
    n1 <- n - n2
    pres <- pwr::pwr.t2n.test(n1 = n1,
                             n2 = n2,
                             d = , sig.level = 0.05, power = 0.8)
    res <- as.data.table(c(n1,n2,n, pres$d))
    dt <- rbind(dt,transpose(res))
  }
  return(dt)
}

nList <- seq(100,3000,50)
p <- 0.38
resDT <- getDrange(nList, p)
resDT[, kWhDiff := V4 * rse]
pl <- ggplot2::ggplot(resDT, aes(x = V3, y = kWhDiff)) + 
  geom_line() +
  geom_point() +
  geom_vline(xintercept = HEEP2full, alpha = 0.3) +
  geom_vline(xintercept = HEEP2fullAvail, alpha = 0.3) +
  geom_vline(xintercept = HEEP2poolOb, alpha = 0.3) +
  labs(x = "Total sample size",
       y = "Mean kWh difference",
       caption = paste0("p = 0.05, power = 0.8\n",
                        "% with heat pump = ",100*p))
pl

ggplot2::ggsave("kWhDiff_rangeHeatPumps.png", pl, 
                width = 6, path = here::here("plots"))
## Saving 6 x 5 in image

Now calculate the different d & kWh for the sample sizes

# what effect size could we get with HEEP2poolAvail? p = 0.05
# HEEP2pool obtained - see table
n1 <- HEEP2fullAvail - (HEEP2fullAvail*0.38)
n2 <- HEEP2fullAvail*0.38
pwr95_HEEP2poolAvail <- pwr::pwr.t2n.test(n1 = n1,
                                       n2 = n2,
                                       d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolAvail
## 
##      t test power calculation 
## 
##              n1 = 434
##              n2 = 266
##               d = 0.2184447
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolAvail$d * rse
## [1] 1.999747
# what effect size could we get with HEEP2poolObtained? p = 0.05
# HEEP2pool obtained - see table

n1 <- HEEP2poolOb - (HEEP2poolOb*0.38)
n2 <- HEEP2poolOb*0.38
pwr95_HEEP2poolOb <- pwr::pwr.t2n.test(n1 = n1,
                                       n2 = n2,
                                       d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolOb
## 
##      t test power calculation 
## 
##              n1 = 1736
##              n2 = 1064
##               d = 0.1091407
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolOb$d * rse
## [1] 0.9991266

Compare morning vs evening

# >> compare am/pm ----
# use HP only data
# remove -ve values
heatPumpDT <- heatPumpDT[meanConsumptionkWh >= 0]

# check heat pump patterns
plotDT <- heatPumpDT[, .(meankWh = mean(meanConsumptionkWh)), 
                     keyby = .(r_halfHour, season)]
ggplot2::ggplot(plotDT, aes(x = r_halfHour, y = meankWh, colour = season)) +
  geom_line()

# Based on the line plot...

heatPumpDT[, period := ifelse(lubridate::hour(r_dateTimeHalfHour) >= 4 &
                                lubridate::hour(r_dateTimeHalfHour) < 10,
                                                "04:00 - 10:00", NA)]
heatPumpDT[, period := ifelse(lubridate::hour(r_dateTimeHalfHour) >= 16 &
                                lubridate::hour(r_dateTimeHalfHour) < 22,
                              "16:00 - 22:00", period)]

# check
with(heatPumpDT, table(lubridate::hour(r_dateTimeHalfHour),period))
##     period
##      04:00 - 10:00 16:00 - 22:00
##   0              0             0
##   1              0             0
##   2              0             0
##   3              0             0
##   4          52173             0
##   5          52200             0
##   6          52249             0
##   7          52390             0
##   8          52491             0
##   9          52273             0
##   10             0             0
##   11             0             0
##   12             0             0
##   13             0             0
##   14             0             0
##   15             0             0
##   16             0         52081
##   17             0         52114
##   18             0         52126
##   19             0         52142
##   20             0         52102
##   21             0         52079
##   22             0             0
##   23             0             0
dailyHeatPumpDT <- heatPumpDT[!is.na(period), .(meankWh = mean(meanConsumptionkWh), # use mean as some have multiple circuits
                                                nObs = .N), # how many obs?
                              keyby = .(r_date, period, linkID, season)]
# check
dailyHeatPumpDT[, .(meanSum = mean(meankWh)), 
                keyby = .(season, period)]
##    season        period    meanSum
## 1: Spring 04:00 - 10:00 0.07821803
## 2: Spring 16:00 - 22:00 0.06663372
## 3: Summer 04:00 - 10:00 0.05652707
## 4: Summer 16:00 - 22:00 0.02865547
## 5: Autumn 04:00 - 10:00 0.08025595
## 6: Autumn 16:00 - 22:00 0.05777627
## 7: Winter 04:00 - 10:00 0.23322393
## 8: Winter 16:00 - 22:00 0.17475496
ggplot2::ggplot(dailyHeatPumpDT, aes(y = meankWh, x = period)) + 
  geom_boxplot() +
  facet_wrap(season ~ .)

dailyMeanHeatPumpDT <- dailyHeatPumpDT[, .(meankWh = mean(meankWh),
                                           sdkWh = sd(meankWh)),
                                       keyby = .(linkID, period, season)]

dailyMeanHeatPumpDT[, .(mean = mean(meankWh)), 
                keyby = .(season, period)]
##    season        period       mean
## 1: Spring 04:00 - 10:00 0.07676568
## 2: Spring 16:00 - 22:00 0.07691729
## 3: Summer 04:00 - 10:00 0.04709558
## 4: Summer 16:00 - 22:00 0.02294221
## 5: Autumn 04:00 - 10:00 0.07036280
## 6: Autumn 16:00 - 22:00 0.06122734
## 7: Winter 04:00 - 10:00 0.22114839
## 8: Winter 16:00 - 22:00 0.17957456
setkey(dailyMeanHeatPumpDT, linkID)
dailyMeanHeatPumpDTLinkedDT <- dailyMeanHeatPumpDT[hhDT]

obsT <- make_p95Table(dailyMeanHeatPumpDTLinkedDT[!is.na(period)], groupVar = "period")

obsT[, kWh := meankWh]
p <- make_CIplot(obsT[season == "winter"], # winter only
                 kwh = "meankWh", xVar = "period", xLab = "Period")
## Saving 6 x 5 in image
p <- p + coord_flip() + labs(y = "Mean kWh")
ggplot2::ggsave(paste0("heatPumpByPeriod_Winter_CI.png"), p, 
                width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
  kable_styling()
period meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper kWh
04:00 - 10:00 0.104 0 0.483 0.112 27 p < 0.1 (observed n) 0.021 0.035 0.069 0.139 0.104
16:00 - 22:00 0.077 0 0.563 0.094 27 p < 0.1 (observed n) 0.018 0.030 0.048 0.107 0.077
04:00 - 10:00 0.104 0 0.483 0.112 27 p < 0.05 (observed n) 0.021 0.042 0.062 0.146 0.104
16:00 - 22:00 0.077 0 0.563 0.094 27 p < 0.05 (observed n) 0.018 0.035 0.042 0.113 0.077

Now power calcs - needs paired

# >> Power calcs - paired ----

# What 'effect size' do we observe?
m1 <- obsT[period == "04:00 - 10:00" & Threshold %like% "0.05", meankWh]
m2 <- obsT[period != "04:00 - 10:00" & Threshold %like% "0.05", meankWh]

# common sd??
# strictly speaking we have paired observations so is this correct?
r <- lm(meankWh ~ period, data = dailyMeanHeatPumpDTLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse

diff <- abs(m1-m2)
# Difference
diff
## [1] 0.02644616
d <- diff/rse

getPairedD <- function(nList,rse){
  dt <- data.table::data.table()
  for(n in nList){
    pres <- pwr::pwr.t.test(n = n,
                              d = , sig.level = 0.05, power = 0.8,
                            type = c("paired"))
    res <- as.data.table(c(n, pres$d, pres$d * rse))
    dt <- rbind(dt,transpose(res))
  }
  return(dt)
}

nList <- c(uniqueN(dailyMeanHeatPumpDT$linkID), HEEP2full,HEEP2fullAvail,HEEP2poolOb)
getPairedD(nList, rse) # print out the d and kWh diff required for each n
##      V1         V2         V3
## 1:   29 0.53895500 0.05675347
## 2:  280 0.16800541 0.01769144
## 3:  700 0.10601270 0.01116343
## 4: 2800 0.05295149 0.00557594
nList <- seq(50,1000,50)
pairedDT <- getPairedD(nList, rse) # print out the d and kWh diff required for each n

pl <- ggplot2::ggplot(pairedDT, aes(x = V1, y = V3)) + 
  geom_line() +
  geom_point() +
  labs(x = "Sub-group sample size",
       y = "Mean kWh difference",
       caption = paste0("p = 0.05, power = 0.8"))
pl

ggplot2::ggsave("kWhDiff_rangeHeatPumpsSubgroups.png", pl, 
                width = 6, path = here::here("plots"))
## Saving 6 x 5 in image

Lighting

# GG
t <- table(hhDT$Q49_coded, useNA = "always")
t
## 
##                     Energy saving - cfl             Halogen        Incandescent 
##                   2                  24                   3                   9 
##                 LED                <NA> 
##                   6                   0
prop.table(t)
## 
##                     Energy saving - cfl             Halogen        Incandescent 
##          0.04545455          0.54545455          0.06818182          0.20454545 
##                 LED                <NA> 
##          0.13636364          0.00000000
# need to use lighting extract

dailyLightingDT <- lightingDT[, .(sumkWh = sum(meanConsumptionkWh)), 
                              keyby = .(r_date, linkID)]
dailyMeanLightingDT <- dailyLightingDT[, .(meankWh = mean(sumkWh),
                                           sdkWh = sd(sumkWh)),
                                       keyby = .(linkID)]
setkey(dailyMeanLightingDT, linkID)
dailyMeanLightingLinkedDT <- dailyMeanLightingDT[hhDT]

dailyMeanLightingLinkedDT[, .(nHHs = uniqueN(linkID)), keyby = .(Q49_coded)]
##              Q49_coded nHHs
## 1:                        2
## 2: Energy saving - cfl   24
## 3:             Halogen    3
## 4:        Incandescent    9
## 5:                 LED    6
dailyMeanLightingLinkedDT <- dailyMeanLightingLinkedDT[Q49_coded != "" & 
                                                         !is.na(meankWh)]

obsT <- make_p95Table(dailyMeanLightingLinkedDT[!is.na(Q49_coded)], groupVar = "Q49_coded")

obsT[, kWh := meankWh]
p <- make_CIplot(obsT, kwh = "meankWh", xVar = "Q49_coded", xLab = "Main lumen type")
## Saving 6 x 5 in image
p <- p + coord_flip() 
ggplot2::ggsave(paste0("Q49_coded_CI.png"), p, 
                width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
  kable_styling()
Q49_coded meankWh minkWh maxkWh sdkWh nHouseholds Threshold se error CI_lower CI_upper kWh
Energy saving - cfl 2.015 0.110 8.562 2.308 12 p < 0.1 (observed n) 0.666 1.096 0.920 3.111 2.015
Halogen 2.262 1.779 2.745 0.683 2 p < 0.1 (observed n) 0.483 0.794 1.468 3.056 2.262
Incandescent 6.028 1.181 12.302 4.330 5 p < 0.1 (observed n) 1.937 3.185 2.842 9.213 6.028
LED 1.278 0.776 1.780 0.710 2 p < 0.1 (observed n) 0.502 0.826 0.452 2.103 1.278
Energy saving - cfl 2.015 0.110 8.562 2.308 12 p < 0.05 (observed n) 0.666 1.306 0.710 3.321 2.015
Halogen 2.262 1.779 2.745 0.683 2 p < 0.05 (observed n) 0.483 0.946 1.316 3.209 2.262
Incandescent 6.028 1.181 12.302 4.330 5 p < 0.05 (observed n) 1.937 3.796 2.232 9.823 6.028
LED 1.278 0.776 1.780 0.710 2 p < 0.05 (observed n) 0.502 0.984 0.294 2.261 1.278

Power calcs?

Proportions

# > Confidence intervals ----
z <- qnorm(0.975) # p = 0.05
p <- 0.4 #test a p
n <- 150
MoE <- z * sqrt(p*(1-p)/(n-1)) # calculate margin of error
MoE
## [1] 0.0786612
# > Power ----
# pwr.2p.test(h = , n = , sig.level =, power = ) 
# calculate for single sample
pwr::pwr.2p.test(h = , n = HEEP2full, sig.level = 0.05, power = 0.8) 
## 
##      Difference of proportion power calculation for binomial distribution (arcsine transformation) 
## 
##               h = 0.2367594
##               n = 280
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: same sample sizes
# but this produces h which needs converting back to %

# single sample test
# n = n in the single sample
pwr.p.test(h = , n = HEEP2full, sig.level = 0.05, power = 0.8)
## 
##      proportion power calculation for binomial distribution (arcsine transformation) 
## 
##               h = 0.1674264
##               n = 280
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided

Test n for different p and proportions within the single sample

# power.prop.test is easier to use
# calculate n for each group - e.g. % heat pumps in renters vs owner-occupiers
# 40% & 25%
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25, 
                       power = 0.8, sig.level = 0.01)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 226.2947
##              p1 = 0.4
##              p2 = 0.25
##       sig.level = 0.01
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25, 
                       power = 0.8, sig.level = 0.05)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 151.8689
##              p1 = 0.4
##              p2 = 0.25
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25, 
                       power = 0.8, sig.level = 0.10)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 119.509
##              p1 = 0.4
##              p2 = 0.25
##       sig.level = 0.1
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25, 
                       power = 0.8, sig.level = 0.20)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 87.00637
##              p1 = 0.4
##              p2 = 0.25
##       sig.level = 0.2
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
message("# 10% & 15%")
## # 10% & 15%
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15, 
                       power = 0.8, sig.level = 0.01)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 1020.47
##              p1 = 0.1
##              p2 = 0.15
##       sig.level = 0.01
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15, 
                       power = 0.8, sig.level = 0.05)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 685.5969
##              p1 = 0.1
##              p2 = 0.15
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15, 
                       power = 0.8, sig.level = 0.10)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 539.9264
##              p1 = 0.1
##              p2 = 0.15
##       sig.level = 0.1
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15, 
                       power = 0.8, sig.level = 0.20)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 393.5438
##              p1 = 0.1
##              p2 = 0.15
##       sig.level = 0.2
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Create a proportions plot and table

calculateProportionsMoE <- function(props, sig, samples){
  # calculate margins of error given prop, a range of significance thresholds (sigs) and sample sizes
  nProps <- length(props)
  nSamps <- length(samples)
  #initialise results array
  resultsArray <- array(numeric(nSamps*nProps),
                        dim=c(nSamps,nProps)
  )
  # loop over samples
  for (s in 1:nSamps){
    # loop over significance values
  for (p in 1:nProps){
    me <- qnorm(1-(sig/2)) * sqrt(props[p]*(1 - props[p])/(samples[s]-1))
    resultsArray[s,p] <- me # report effect size against sample size
  }
  }
  dt <- data.table::as.data.table(resultsArray) # convert to dt for tidying
  dt <- cbind(dt, samples)
  longDT <- data.table::as.data.table(reshape2::melt(dt, id=c("samples")))
  longDT <- data.table::setnames(longDT, "value", "moe")
  longDT <- data.table::setnames(longDT, "variable", "proportion")
  return(longDT) # returned the tidied & long form dt
}

samples <- seq(100,3000,10)
props <- c(0.1, 0.2, 0.3, 0.4, 0.5)
dt <- calculateProportionsMoE(props = props, # one sample
                                    sig = 0.05,
                                    samples = samples
                                    )
# need to recode vars
# must be an easier way
dt[, prop := ifelse(proportion == "V1", "10%", NA)]
dt[, prop := ifelse(proportion == "V2", "20%", prop)]
dt[, prop := ifelse(proportion == "V3", "30%", prop)]
dt[, prop := ifelse(proportion == "V4", "40%", prop)]
dt[, prop := ifelse(proportion == "V5", "50%", prop)]

p <- ggplot2::ggplot(dt, aes(x = samples, y = 100*moe, colour = prop)) +
  geom_point(alpha = 0.5) +
  geom_line() +
  labs(y = "Margin of error (+/-%)",
       x = "Sample N (single sample)") +
  scale_color_discrete(name="Proportion:") +
  theme(legend.position="bottom") +
  geom_vline(xintercept = HEEP2full, alpha = 0.3) +
  geom_vline(xintercept = HEEP2fullAvail, alpha = 0.3) +
  geom_vline(xintercept = HEEP2poolOb, alpha = 0.3)

p <- p + annotate(geom = "text", 
             x = HEEP2full, 
             y = 0.9*(max(p$data$moe)*100), 
             label = paste0("n =", HEEP2full), 
             hjust = "left") +
  annotate(geom = "text", 
           x = HEEP2fullAvail, 
           y = 0.8*(max(p$data$moe)*100), 
           label = paste0("n = ", HEEP2fullAvail), 
           hjust = "left") +
  annotate(geom = "text", 
           x = HEEP2poolOb, 
           y = 0.9*(max(p$data$moe)*100), 
           label = paste0("n = ", HEEP2poolOb), 
           hjust = "right") 

ggplot2::ggsave("proportionsMoE.png", p, 
                width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
p

# details
dt[samples == HEEP2full]
##    samples proportion        moe prop
## 1:     280         V1 0.03520199  10%
## 2:     280         V2 0.04693599  20%
## 3:     280         V3 0.05377193  30%
## 4:     280         V4 0.05748461  40%
## 5:     280         V5 0.05866999  50%
dt[samples == HEEP2fullAvail]
##    samples proportion        moe prop
## 1:     700         V1 0.02223979  10%
## 2:     700         V2 0.02965306  20%
## 3:     700         V3 0.03397185  30%
## 4:     700         V4 0.03631743  40%
## 5:     700         V5 0.03706632  50%
dt[samples == HEEP2poolOb]
##    samples proportion        moe prop
## 1:    2800         V1 0.01111394  10%
## 2:    2800         V2 0.01481858  20%
## 3:    2800         V3 0.01697682  30%
## 4:    2800         V4 0.01814898  40%
## 5:    2800         V5 0.01852323  50%

The end